1   package org.apache.lucene.index;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import org.apache.lucene.document.Document;
21  import org.apache.lucene.document.DocumentStoredFieldVisitor;
22  import org.apache.lucene.store.AlreadyClosedException;
23  import org.apache.lucene.util.Bits;  // javadocs
24  import org.apache.lucene.util.IOUtils;
25  
26  import java.io.Closeable;
27  import java.io.IOException;
28  import java.util.Collections;
29  import java.util.LinkedHashSet;
30  import java.util.List;
31  import java.util.Set;
32  import java.util.WeakHashMap;
33  import java.util.concurrent.atomic.AtomicInteger;
34  
35  /**
36   IndexReader is an abstract class, providing an interface for accessing a
37   point-in-time view of an index.  Any changes made to the index
38   via {@link IndexWriter} will not be visible until a new
39   {@code IndexReader} is opened.  It's best to use {@link
40   DirectoryReader#open(IndexWriter,boolean)} to obtain an
41   {@code IndexReader}, if your {@link IndexWriter} is
42   in-process.  When you need to re-open to see changes to the
43   index, it's best to use {@link DirectoryReader#openIfChanged(DirectoryReader)}
44   since the new reader will share resources with the previous
45   one when possible.  Search of an index is done entirely
46   through this abstract interface, so that any subclass which
47   implements it is searchable.
48  
49   <p>There are two different types of IndexReaders:
50   <ul>
51    <li>{@link LeafReader}: These indexes do not consist of several sub-readers,
52    they are atomic. They support retrieval of stored fields, doc values, terms,
53    and postings.
54    <li>{@link CompositeReader}: Instances (like {@link DirectoryReader})
55    of this reader can only
56    be used to get stored fields from the underlying LeafReaders,
57    but it is not possible to directly retrieve postings. To do that, get
58    the sub-readers via {@link CompositeReader#getSequentialSubReaders}.
  Alternatively, you can mimic a {@link LeafReader} (with a serious slowdown),
60    by wrapping composite readers with {@link SlowCompositeReaderWrapper}.
61   </ul>
62   
63   <p>IndexReader instances for indexes on disk are usually constructed
64   with a call to one of the static <code>DirectoryReader.open()</code> methods,
 e.g. {@link DirectoryReader#open(org.apache.lucene.store.Directory)}. Because {@link DirectoryReader}
 implements the {@link CompositeReader} interface, it is not possible to directly get postings from it.
67  
68   <p> For efficiency, in this API documents are often referred to via
69   <i>document numbers</i>, non-negative integers which each name a unique
70   document in the index.  These document numbers are ephemeral -- they may change
71   as documents are added to and deleted from an index.  Clients should thus not
72   rely on a given document having the same number between sessions.
73  
74   <p>
75   <a name="thread-safety"></a><p><b>NOTE</b>: {@link
76   IndexReader} instances are completely thread
77   safe, meaning multiple threads can call any of its methods,
78   concurrently.  If your application requires external
79   synchronization, you should <b>not</b> synchronize on the
80   <code>IndexReader</code> instance; use your own
81   (non-Lucene) objects instead.
82  */
83  public abstract class IndexReader implements Closeable {
84    
85    private boolean closed = false;
86    private boolean closedByChild = false;
87    private final AtomicInteger refCount = new AtomicInteger(1);
88  
89    IndexReader() {
90      if (!(this instanceof CompositeReader || this instanceof LeafReader))
91        throw new Error("IndexReader should never be directly extended, subclass LeafReader or CompositeReader instead.");
92    }
93    
94    /**
95     * A custom listener that's invoked when the IndexReader
96     * is closed.
97     *
98     * @lucene.experimental
99     */
100   public static interface ReaderClosedListener {
101     /** Invoked when the {@link IndexReader} is closed. */
102     public void onClose(IndexReader reader) throws IOException;
103   }
104 
105   private final Set<ReaderClosedListener> readerClosedListeners = 
106       Collections.synchronizedSet(new LinkedHashSet<ReaderClosedListener>());
107 
108   private final Set<IndexReader> parentReaders = 
109       Collections.synchronizedSet(Collections.newSetFromMap(new WeakHashMap<IndexReader,Boolean>()));
110 
111   /** Expert: adds a {@link ReaderClosedListener}.  The
112    * provided listener will be invoked when this reader is closed.
113    * At this point, it is safe for apps to evict this reader from
114    * any caches keyed on {@link #getCombinedCoreAndDeletesKey()}.
115    *
116    * @lucene.experimental */
117   public final void addReaderClosedListener(ReaderClosedListener listener) {
118     ensureOpen();
119     readerClosedListeners.add(listener);
120   }
121 
122   /** Expert: remove a previously added {@link ReaderClosedListener}.
123    *
124    * @lucene.experimental */
125   public final void removeReaderClosedListener(ReaderClosedListener listener) {
126     ensureOpen();
127     readerClosedListeners.remove(listener);
128   }
129   
130   /** Expert: This method is called by {@code IndexReader}s which wrap other readers
131    * (e.g. {@link CompositeReader} or {@link FilterLeafReader}) to register the parent
132    * at the child (this reader) on construction of the parent. When this reader is closed,
133    * it will mark all registered parents as closed, too. The references to parent readers
134    * are weak only, so they can be GCed once they are no longer in use.
135    * @lucene.experimental */
136   public final void registerParentReader(IndexReader reader) {
137     ensureOpen();
138     parentReaders.add(reader);
139   }
140 
141   private void notifyReaderClosedListeners(Throwable th) {
142     synchronized(readerClosedListeners) {
143       for(ReaderClosedListener listener : readerClosedListeners) {
144         try {
145           listener.onClose(this);
146         } catch (Throwable t) {
147           if (th == null) {
148             th = t;
149           } else {
150             th.addSuppressed(t);
151           }
152         }
153       }
154       IOUtils.reThrowUnchecked(th);
155     }
156   }
157 
158   private void reportCloseToParentReaders() {
159     synchronized(parentReaders) {
160       for(IndexReader parent : parentReaders) {
161         parent.closedByChild = true;
162         // cross memory barrier by a fake write:
163         parent.refCount.addAndGet(0);
164         // recurse:
165         parent.reportCloseToParentReaders();
166       }
167     }
168   }
169 
170   /** Expert: returns the current refCount for this reader */
171   public final int getRefCount() {
172     // NOTE: don't ensureOpen, so that callers can see
173     // refCount is 0 (reader is closed)
174     return refCount.get();
175   }
176   
177   /**
178    * Expert: increments the refCount of this IndexReader
179    * instance.  RefCounts are used to determine when a
180    * reader can be closed safely, i.e. as soon as there are
181    * no more references.  Be sure to always call a
182    * corresponding {@link #decRef}, in a finally clause;
183    * otherwise the reader may never be closed.  Note that
184    * {@link #close} simply calls decRef(), which means that
185    * the IndexReader will not really be closed until {@link
186    * #decRef} has been called for all outstanding
187    * references.
188    *
189    * @see #decRef
190    * @see #tryIncRef
191    */
192   public final void incRef() {
193     if (!tryIncRef()) {
194       ensureOpen();
195     }
196   }
197   
198   /**
199    * Expert: increments the refCount of this IndexReader
200    * instance only if the IndexReader has not been closed yet
201    * and returns <code>true</code> iff the refCount was
202    * successfully incremented, otherwise <code>false</code>.
203    * If this method returns <code>false</code> the reader is either
204    * already closed or is currently being closed. Either way this
205    * reader instance shouldn't be used by an application unless
206    * <code>true</code> is returned.
207    * <p>
208    * RefCounts are used to determine when a
209    * reader can be closed safely, i.e. as soon as there are
210    * no more references.  Be sure to always call a
211    * corresponding {@link #decRef}, in a finally clause;
212    * otherwise the reader may never be closed.  Note that
213    * {@link #close} simply calls decRef(), which means that
214    * the IndexReader will not really be closed until {@link
215    * #decRef} has been called for all outstanding
216    * references.
217    *
218    * @see #decRef
219    * @see #incRef
220    */
221   public final boolean tryIncRef() {
222     int count;
223     while ((count = refCount.get()) > 0) {
224       if (refCount.compareAndSet(count, count+1)) {
225         return true;
226       }
227     }
228     return false;
229   }
230 
231   /**
232    * Expert: decreases the refCount of this IndexReader
233    * instance.  If the refCount drops to 0, then this
234    * reader is closed.  If an exception is hit, the refCount
235    * is unchanged.
236    *
237    * @throws IOException in case an IOException occurs in  doClose()
238    *
239    * @see #incRef
240    */
241   public final void decRef() throws IOException {
242     // only check refcount here (don't call ensureOpen()), so we can
243     // still close the reader if it was made invalid by a child:
244     if (refCount.get() <= 0) {
245       throw new AlreadyClosedException("this IndexReader is closed");
246     }
247     
248     final int rc = refCount.decrementAndGet();
249     if (rc == 0) {
250       closed = true;
251       Throwable throwable = null;
252       try {
253         doClose();
254       } catch (Throwable th) {
255         throwable = th;
256       } finally {
257         try {
258           reportCloseToParentReaders();
259         } finally {
260           notifyReaderClosedListeners(throwable);
261         }
262       }
263     } else if (rc < 0) {
264       throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
265     }
266   }
267   
268   /**
269    * Throws AlreadyClosedException if this IndexReader or any
270    * of its child readers is closed, otherwise returns.
271    */
272   protected final void ensureOpen() throws AlreadyClosedException {
273     if (refCount.get() <= 0) {
274       throw new AlreadyClosedException("this IndexReader is closed");
275     }
276     // the happens before rule on reading the refCount, which must be after the fake write,
277     // ensures that we see the value:
278     if (closedByChild) {
279       throw new AlreadyClosedException("this IndexReader cannot be used anymore as one of its child readers was closed");
280     }
281   }
282   
283   /** {@inheritDoc}
284    * <p>For caching purposes, {@code IndexReader} subclasses are not allowed
285    * to implement equals/hashCode, so methods are declared final.
286    * To lookup instances from caches use {@link #getCoreCacheKey} and 
287    * {@link #getCombinedCoreAndDeletesKey}.
288    */
289   @Override
290   public final boolean equals(Object obj) {
291     return (this == obj);
292   }
293   
294   /** {@inheritDoc}
295    * <p>For caching purposes, {@code IndexReader} subclasses are not allowed
296    * to implement equals/hashCode, so methods are declared final.
297    * To lookup instances from caches use {@link #getCoreCacheKey} and 
298    * {@link #getCombinedCoreAndDeletesKey}.
299    */
300   @Override
301   public final int hashCode() {
302     return System.identityHashCode(this);
303   }
304 
305   /** Retrieve term vectors for this document, or null if
306    *  term vectors were not indexed.  The returned Fields
307    *  instance acts like a single-document inverted index
308    *  (the docID will be 0). */
309   public abstract Fields getTermVectors(int docID)
310           throws IOException;
311 
312   /** Retrieve term vector for this document and field, or
313    *  null if term vectors were not indexed.  The returned
314    *  Fields instance acts like a single-document inverted
315    *  index (the docID will be 0). */
316   public final Terms getTermVector(int docID, String field)
317     throws IOException {
318     Fields vectors = getTermVectors(docID);
319     if (vectors == null) {
320       return null;
321     }
322     return vectors.terms(field);
323   }
324 
325   /** Returns the number of documents in this index. */
326   public abstract int numDocs();
327 
328   /** Returns one greater than the largest possible document number.
329    * This may be used to, e.g., determine how big to allocate an array which
330    * will have an element for every document number in an index.
331    */
332   public abstract int maxDoc();
333 
334   /** Returns the number of deleted documents. */
335   public final int numDeletedDocs() {
336     return maxDoc() - numDocs();
337   }
338 
339   /** Expert: visits the fields of a stored document, for
340    *  custom processing/loading of each field.  If you
341    *  simply want to load all fields, use {@link
342    *  #document(int)}.  If you want to load a subset, use
343    *  {@link DocumentStoredFieldVisitor}.  */
344   public abstract void document(int docID, StoredFieldVisitor visitor) throws IOException;
345   
346   /**
347    * Returns the stored fields of the <code>n</code><sup>th</sup>
348    * <code>Document</code> in this index.  This is just
349    * sugar for using {@link DocumentStoredFieldVisitor}.
350    * <p>
351    * <b>NOTE:</b> for performance reasons, this method does not check if the
352    * requested document is deleted, and therefore asking for a deleted document
353    * may yield unspecified results. Usually this is not required, however you
354    * can test if the doc is deleted by checking the {@link
355    * Bits} returned from {@link MultiFields#getLiveDocs}.
356    *
357    * <b>NOTE:</b> only the content of a field is returned,
358    * if that field was stored during indexing.  Metadata
359    * like boost, omitNorm, IndexOptions, tokenized, etc.,
360    * are not preserved.
361    * 
362    * @throws CorruptIndexException if the index is corrupt
363    * @throws IOException if there is a low-level IO error
364    */
365   // TODO: we need a separate StoredField, so that the
366   // Document returned here contains that class not
367   // IndexableField
368   public final Document document(int docID) throws IOException {
369     final DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
370     document(docID, visitor);
371     return visitor.getDocument();
372   }
373 
374   /**
375    * Like {@link #document(int)} but only loads the specified
376    * fields.  Note that this is simply sugar for {@link
377    * DocumentStoredFieldVisitor#DocumentStoredFieldVisitor(Set)}.
378    */
379   public final Document document(int docID, Set<String> fieldsToLoad)
380       throws IOException {
381     final DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(
382         fieldsToLoad);
383     document(docID, visitor);
384     return visitor.getDocument();
385   }
386 
387   /** Returns true if any documents have been deleted. Implementers should
388    *  consider overriding this method if {@link #maxDoc()} or {@link #numDocs()}
389    *  are not constant-time operations. */
390   public boolean hasDeletions() {
391     return numDeletedDocs() > 0;
392   }
393 
394   /**
395    * Closes files associated with this index.
396    * Also saves any new deletions to disk.
397    * No other methods should be called after this has been called.
398    * @throws IOException if there is a low-level IO error
399    */
400   @Override
401   public final synchronized void close() throws IOException {
402     if (!closed) {
403       decRef();
404       closed = true;
405     }
406   }
407   
408   /** Implements close. */
409   protected abstract void doClose() throws IOException;
410 
411   /**
412    * Expert: Returns the root {@link IndexReaderContext} for this
413    * {@link IndexReader}'s sub-reader tree. 
414    * <p>
415    * Iff this reader is composed of sub
416    * readers, i.e. this reader being a composite reader, this method returns a
417    * {@link CompositeReaderContext} holding the reader's direct children as well as a
418    * view of the reader tree's atomic leaf contexts. All sub-
419    * {@link IndexReaderContext} instances referenced from this readers top-level
420    * context are private to this reader and are not shared with another context
421    * tree. For example, IndexSearcher uses this API to drive searching by one
422    * atomic leaf reader at a time. If this reader is not composed of child
423    * readers, this method returns an {@link LeafReaderContext}.
424    * <p>
425    * Note: Any of the sub-{@link CompositeReaderContext} instances referenced
426    * from this top-level context do not support {@link CompositeReaderContext#leaves()}.
427    * Only the top-level context maintains the convenience leaf-view
428    * for performance reasons.
429    */
430   public abstract IndexReaderContext getContext();
431   
432   /**
433    * Returns the reader's leaves, or itself if this reader is atomic.
434    * This is a convenience method calling {@code this.getContext().leaves()}.
435    * @see IndexReaderContext#leaves()
436    */
437   public final List<LeafReaderContext> leaves() {
438     return getContext().leaves();
439   }
440 
441   /** Expert: Returns a key for this IndexReader, so CachingWrapperFilter can find
442    * it again.
443    * This key must not have equals()/hashCode() methods, so &quot;equals&quot; means &quot;identical&quot;. */
444   public Object getCoreCacheKey() {
445     // Don't call ensureOpen since FC calls this (to evict)
446     // on close
447     return this;
448   }
449 
450   /** Expert: Returns a key for this IndexReader that also includes deletions,
451    * so CachingWrapperFilter can find it again.
452    * This key must not have equals()/hashCode() methods, so &quot;equals&quot; means &quot;identical&quot;. */
453   public Object getCombinedCoreAndDeletesKey() {
454     // Don't call ensureOpen since FC calls this (to evict)
455     // on close
456     return this;
457   }
458   
459   /** Returns the number of documents containing the 
460    * <code>term</code>.  This method returns 0 if the term or
461    * field does not exists.  This method does not take into
462    * account deleted documents that have not yet been merged
463    * away. 
464    * @see TermsEnum#docFreq()
465    */
466   public abstract int docFreq(Term term) throws IOException;
467   
468   /**
469    * Returns the total number of occurrences of {@code term} across all
470    * documents (the sum of the freq() for each doc that has this term). This
471    * will be -1 if the codec doesn't support this measure. Note that, like other
472    * term measures, this measure does not take deleted documents into account.
473    */
474   public abstract long totalTermFreq(Term term) throws IOException;
475   
476   /**
477    * Returns the sum of {@link TermsEnum#docFreq()} for all terms in this field,
478    * or -1 if this measure isn't stored by the codec. Note that, just like other
479    * term measures, this measure does not take deleted documents into account.
480    * 
481    * @see Terms#getSumDocFreq()
482    */
483   public abstract long getSumDocFreq(String field) throws IOException;
484   
485   /**
486    * Returns the number of documents that have at least one term for this field,
487    * or -1 if this measure isn't stored by the codec. Note that, just like other
488    * term measures, this measure does not take deleted documents into account.
489    * 
490    * @see Terms#getDocCount()
491    */
492   public abstract int getDocCount(String field) throws IOException;
493 
494   /**
495    * Returns the sum of {@link TermsEnum#totalTermFreq} for all terms in this
496    * field, or -1 if this measure isn't stored by the codec (or if this fields
497    * omits term freq and positions). Note that, just like other term measures,
498    * this measure does not take deleted documents into account.
499    * 
500    * @see Terms#getSumTotalTermFreq()
501    */
502   public abstract long getSumTotalTermFreq(String field) throws IOException;
503 
504 }